package com.example.demo.service;

import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicHttpResponse;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.Closeable;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;

/**
 * Minimal spider that downloads the news index page of the Henan CCPIT site
 * and prints the date, title and link of every listed article to stdout.
 */
public class NromalSpider {

    /** Timestamp layout of the start-up banner; the formatter is immutable, so build it once. */
    private static final DateTimeFormatter BANNER_TIME_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    /** Index page that lists the articles to scrape. */
    private static final String INDEX_URL = "http://www.ccpit-henan.org/mcxw/index.jhtml";

    /** Timeout in milliseconds used for connect, socket read and connection-pool lease. */
    private static final int TIMEOUT_MILLIS = 5000;

    public static void main(String[] args) throws IOException {
        execSpider();
    }


    /**
     * Crawls the article list of the Henan CCPIT site and prints one line per
     * article in the form {@code "<date> <title> <href>"}.
     *
     * <p>When the page cannot be fetched, or the server answers with anything
     * other than 200, the status line is reported on stderr and nothing is parsed.
     *
     * @throws IOException if reading the response body or closing the client fails
     */
    public static void execSpider() throws IOException {
        System.out.println("爬取河南省贸促会的文章: " + LocalDateTime.now().format(BANNER_TIME_FORMAT));
        HttpClient httpClient = HttpClientBuilder.create().build();
        try {
            HttpResponse response = getHtml(httpClient, INDEX_URL);
            try {
                int statusCode = response.getStatusLine().getStatusCode();
                // Guard against a body-less response: EntityUtils.toString rejects a null entity.
                if (statusCode == HttpStatus.SC_OK && response.getEntity() != null) {
                    // Decode as UTF-8 unless the response's Content-Type names another charset.
                    String html = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
                    printArticles(Jsoup.parse(html));
                } else {
                    System.err.println("Skipping " + INDEX_URL + ": " + response.getStatusLine());
                }
            } finally {
                // Always release the entity (null-safe) so the connection is freed even if parsing throws.
                EntityUtils.consumeQuietly(response.getEntity());
            }
        } finally {
            // The HttpClient interface has no close(); the builder's client implements Closeable.
            if (httpClient instanceof Closeable) {
                ((Closeable) httpClient).close();
            }
        }
    }

    /**
     * Prints every {@code li} found under {@code div[class=box-content]}:
     * the text of its {@code em} (date), the text of its {@code span} (title)
     * and the {@code href} of its {@code a}, separated by single spaces.
     */
    private static void printArticles(Document doc) {
        Elements items = doc.select("div[class=box-content]").select("li");
        for (Element item : items) {
            String title = item.select("span").text();
            String date = item.select("em").text();
            String href = item.select("a").attr("href");
            System.out.println(date + " " + title + " " + href);
        }
    }

    /**
     * Executes a GET request for {@code url} with 5 second timeouts and cookies ignored.
     *
     * <p>This method never throws {@link IOException}. If the request fails, the
     * error is reported on stderr and a synthetic {@code 503} response without an
     * entity is returned, so callers that check for status 200 do not mistake a
     * transport failure for a successful, parsable response.
     *
     * @param client the client used to execute the request
     * @param url    the absolute URL to fetch
     * @return the server's response, or a synthetic 503 response if the request failed
     */
    public static HttpResponse getHtml(HttpClient client, String url) {
        HttpGet getMethod = new HttpGet(url);
        RequestConfig requestConfig = RequestConfig.custom()
                .setSocketTimeout(TIMEOUT_MILLIS)
                .setConnectTimeout(TIMEOUT_MILLIS)
                .setConnectionRequestTimeout(TIMEOUT_MILLIS)
                // Ignore cookies entirely; this also cuts down on cookie-related log output.
                .setCookieSpec(CookieSpecs.IGNORE_COOKIES)
                .build();
        getMethod.setConfig(requestConfig);
        try {
            return client.execute(getMethod);
        } catch (IOException e) {
            System.err.println("GET " + url + " failed: " + e);
            // Report the failure through a non-200 status instead of a fake "200 OK".
            return new BasicHttpResponse(
                    HttpVersion.HTTP_1_1, HttpStatus.SC_SERVICE_UNAVAILABLE, "Request failed: " + e);
        }
    }

}

